import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
sns.set_theme(style="whitegrid")
# log10(flow) ~ prop_users_dest + prop_users_orig + query_date + log10(distance) + indicator_vars
fit_df = pd.read_csv("/Users/scharlottej13/Nextcloud/linkedin_recruiter/outputs/model_output_2021-02-09.csv")
# df = pd.read_csv("N:/johnson/linkedin_recruiter/outputs/model_output_2021-02-09.csv")
# residuals vs. log10(distance)
fig = px.scatter(
fit_df.sort_values(by='query_date'), x=np.log10(fit_df['distance']), y='resids', color='dest_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'],
labels={'x': 'log10(distance [km])'})
fig.add_hline(y=0)
fig.show()
# residuals vs. proportion of linkedin users in destination
fig = px.scatter(fit_df.sort_values(by='query_date'), x='prop_users_dest', y='resids', color='dest_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'])
fig.add_hline(y=0)
fig.show()
# residuals vs. proportion of linkedin users in origin
fig = px.scatter(fit_df.sort_values(by='query_date'), x='prop_users_orig', y='resids', color='orig_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'])
fig.add_hline(y=0)
fig.show()
# residuals vs. predictions
fig = px.scatter(
fit_df.sort_values(by='query_date'), x='preds', y=fit_df['resids'].abs(), color='orig_reg',
facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'],
labels = {'y': 'abs(residuals)'})
fig.show()
# standardized residuals, "number of standard errors away from regression line"
# quantify how large the residuals are in standard deviation units
# residuals / standard deviation
# abs(std resid) > 3 is sometimes a threshold for outlier detection
# first, very positive residuals (underestimate of y)
fit_df.loc[fit_df['sresids'].abs() > 3].sort_values(
by='sresids', ascending=False
)[['country_orig', 'country_dest']].drop_duplicates().head(10)
| country_orig | country_dest | |
|---|---|---|
| 20199 | Netherlands | Curacao |
| 29542 | Spain | Grenada |
| 22201 | France | New Caledonia |
| 25122 | France | French Guiana |
| 19156 | Netherlands | Suriname |
| 18774 | Netherlands | Aruba |
| 25609 | France | Reunion |
| 17340 | Brazil | Portugal |
| 20275 | France | French Polynesia |
| 25533 | France | Mayotte |
# and the very negative residuals (overestimate of y)
fit_df.loc[fit_df['sresids'].abs() > 3].sort_values(by='sresids')[['country_orig', 'country_dest']].drop_duplicates().head(10)
| country_orig | country_dest | |
|---|---|---|
| 17950 | Jamaica | Cuba |
| 25547 | Kenya | Mayotte |
| 25611 | South Africa | Reunion |
| 17441 | Venezuela | Saint Lucia |
| 31705 | Jamaica | Dominican Republic |
| 9887 | Senegal | Sierra Leone |
| 22264 | Fiji | New Caledonia |
| 20334 | New Zealand | French Polynesia |
| 2715 | Kenya | South Sudan |
| 36083 | Saudi Arabia | Egypt |
# number of predictors
P = 390
# number of observations
n = len(fit_df)
# hat-values, commonly used to measure leverage
hat_threshold = (2 * (P + 1))/n
print(hat_threshold)
# "influential" country pairs
fit_df.loc[fit_df['hat_values'] > hat_threshold].sort_values(
by='hat_values', ascending=False
)[['country_orig', 'country_dest']].drop_duplicates().head(20)
0.01928483353884094
| country_orig | country_dest | |
|---|---|---|
| 1524 | Mongolia | Kyrgyzstan |
| 21713 | Iceland | Zimbabwe |
| 16143 | Guinea-Bissau | Cape Verde |
| 40349 | Isle of Man | Gibraltar |
| 39335 | Gibraltar | Malta |
| 38205 | Gibraltar | Isle of Man |
| 20246 | Aruba | Curacao |
| 40469 | Nauru | Tonga |
| 17715 | Montserrat | Anguilla |
| 16408 | Cape Verde | Guinea-Bissau |
| 7772 | Tuvalu | Fiji |
| 38288 | Tuvalu | Nauru |
| 22289 | French Polynesia | New Caledonia |
| 8735 | Seychelles | Maldives |
| 10956 | Cook Islands | Samoa |
| 22527 | Cook Islands | Tuvalu |
| 5740 | Comoros | Madagascar |
| 31744 | Puerto Rico | Dominican Republic |
| 12485 | Albania | Moldova |
| 13959 | Equatorial Guinea | Gabon |
# Cook's Distance is a combination of standardized residuals & leverage
cooks_threshold = 4 / (n - P - 1)
print(cooks_threshold)
# "influential" country pairs
fit_df.loc[fit_df['cooks_dist'] > cooks_threshold].sort_values(
by='cooks_dist', ascending=False
)[['country_orig', 'country_dest']].drop_duplicates().head(20)
9.960407380661868e-05
| country_orig | country_dest | |
|---|---|---|
| 25609 | France | Reunion |
| 20200 | Netherlands | Curacao |
| 22196 | France | New Caledonia |
| 11432 | Cambodia | Vietnam |
| 6990 | Tajikistan | Afghanistan |
| 25533 | France | Mayotte |
| 10440 | Cambodia | Thailand |
| 29542 | Spain | Grenada |
| 25123 | France | French Guiana |
| 39335 | Gibraltar | Malta |
| 25179 | Mayotte | French Guiana |
| 16136 | Angola | Cape Verde |
| 20234 | Suriname | Curacao |
| 20274 | France | French Polynesia |
| 25547 | Kenya | Mayotte |
| 25611 | South Africa | Reunion |
| 40253 | Czech Republic | Slovakia |
| 19267 | French Guiana | Suriname |
| 22264 | Fiji | New Caledonia |
| 18832 | Suriname | Aruba |
# using standard residuals, cook's distance, & hat values
fig = px.scatter(
fit_df.sort_values(by='query_date'), x='hat_values', y='sresids', size=np.sqrt(fit_df['cooks_dist']*10),
color='orig_reg', facet_col='query_date', facet_col_wrap=3,
hover_data=['country_orig', 'country_dest'],
labels = {'hat_values': 'hat values', 'sresids': 'std. residuals'})
fig.show()
# let's zoom in on one date
sub_df = fit_df.query("query_date == '2020-11-19'")
fig = px.scatter(
sub_df, x='hat_values', y='sresids', size=np.sqrt(sub_df['cooks_dist']*10),
color='orig_reg', hover_data=['country_orig', 'country_dest'],
labels = {'hat_values': 'leverage (hat-values)', 'sresids': 'outliers (standardized residuals)'})
fig.add_hline(y=3, line_dash='dash')
fig.add_hline(y=-3, line_dash='dash')
fig.add_vline(x=hat_threshold, line_dash='dash')
fig.show()